Loading Data
The following libraries will be used.
library(data.table) # database-like tables
library(ggplot2) # general plots
library(highcharter) # interactive plots
library(DT) # display tables
library(corrplot) # corr SPLOMs
library(vcd) # stats for categories
library(mice) # multiple imputation
library(Boruta) # estimate variable importance
Input is loaded and covnerted to data.table.
test <- read.csv("../input/test.csv")
train <- read.csv("../input/train.csv")
test <- data.table(test)
train <- data.table(train)
The number of dimensions and sampling density is calculated. The datasets are combined.
(N <- nrow(train))
## [1] 1460
(p <- ncol(train) - 2)
## [1] 79
N^(1/p)
## [1] 1.096617
test$SalePrice <- NaN
full <- rbind(train, test)
full$label <- rep(c("train", "test"), c(nrow(train), nrow(test)))
From the data_description variable types can be defined. There are 4 categories: nominal, ordinal, discrete, continuous. A lot of the discrete variables could arguably be handled as ordinals.
nomVars <- c("MSZoning", "Street", "Alley", "LandContour", "LotConfig",
"Neighborhood", "Condition1", "Condition2", "HouseStyle",
"RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd",
"MasVnrType", "Foundation", "Heating", "CentralAir", "Electrical",
"GarageType", "PavedDrive", "MiscFeature", "SaleType",
"SaleCondition")
ordVars <- c("MSSubClass", "LotShape", "Utilities", "LandSlope", "BldgType",
"OverallQual", "OverallCond", "ExterQual", "ExterCond", "BsmtQual",
"BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
"HeatingQC", "KitchenQual", "Functional", "FireplaceQu",
"GarageFinish", "GarageQual", "GarageCond", "PoolQC", "Fence",
"MoSold")
discVars <- c("YearBuilt", "YearRemodAdd", "BsmtFullBath", "BsmtHalfBath",
"FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr",
"TotRmsAbvGrd", "Fireplaces", "GarageYrBlt", "GarageCars",
"YrSold")
contVars <- c("LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
"TotalBsmtSF", "X1stFlrSF", "X2ndFlrSF", "LowQualFinSF",
"GrLivArea", "GarageArea", "WoodDeckSF", "OpenPorchSF",
"EnclosedPorch", "X3SsnPorch", "ScreenPorch", "PoolArea",
"MiscVal", "LotFrontage")
A table is created to store feature attributes.
featAttr <- data.table(
name = colnames(test)[-1],
type = "nominal"
)
featAttr[name %in% ordVars, type := "ordinal"]
featAttr[name %in% discVars, type := "discrete"]
featAttr[name %in% contVars, type := "continuous"]
Special Values
According to data_description several \(NA\) have a special meaning. These values are replaced with more descriptive chr strings. Before, test and train set are combined.
full[is.na(Alley), Alley := "noAccess"]
full[is.na(BsmtQual), BsmtQual := "noBasement"]
full[is.na(BsmtCond), BsmtCond := "noBasement"]
full[is.na(BsmtExposure), BsmtExposure := "noBasement"]
full[is.na(BsmtFinType1), BsmtFinType1 := "noBasement"]
full[is.na(BsmtFinType2), BsmtFinType2 := "noBasement"]
full[is.na(FireplaceQu), FireplaceQu := "noFireplace"]
full[is.na(GarageType), GarageType := "noGarage"]
full[is.na(GarageFinish), GarageFinish := "noGarage"]
full[is.na(GarageQual), GarageQual := "noGarage"]
full[is.na(GarageCond), GarageCond := "noGarage"]
full[is.na(PoolQC) , PoolQC := "noPool"]
full[is.na(Fence), Fence := "noFence"]
full[is.na(MiscFeature), MiscFeature := "none"]
Nominal, and ordinal variables are converted to factor, continuous to numeric. Discrete variables contain years and measures like number of cars so they will be converted to integer.
for (j in c(nomVars, ordVars)) full[[j]] <- as.factor(full[[j]])
for (j in contVars) full[[j]] <- as.numeric(full[[j]])
for (j in discVars) full[[j]] <- as.integer(full[[j]])
Ordering
From data_description the order of ordinal variable classes can be derived. These orderings are defined, then the levels of their factors are adjusted.
lvlOrd <- list(
LotShape = c("Reg", "IR1", "IR2", "IR3"),
Utilities = c("AllPub", "NoSewr", "NoSeWa", "ELO"),
LandSlope = c("Gtl", "Mod", "Sev"),
BldgType = c("1Fam", "2FmCon", "Duplx", "TwnhsE", "TwnhsI"),
OverallQual = 10:1,
OverallCond = 10:1,
ExterQual = c("Ex", "Gd", "TA", "Fa", "Po"),
ExterCond = c("Ex", "Gd", "TA", "Fa", "Po"),
BsmtQual = c("Ex", "Gd", "TA", "Fa", "Po", "noBasement"),
BsmtCond = c("Ex", "Gd", "TA", "Fa", "Po", "noBasement"),
BsmtExposure = c("Gd", "Av", "Mn", "No", "noBasement"),
BsmtFinType1 = c("GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "noBasement"),
BsmtFinType2 = c("GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "noBasement"),
HeatingQC = c("Ex", "Gd", "TA", "Fa", "Po"),
KitchenQual = c("Ex", "Gd", "TA", "Fa", "Po"),
Functional = c("Typ", "Min1", "Min2", "Mod", "Maj1", "Maj2", "Sev", "Sal"),
FireplaceQu = c("Ex", "Gd", "TA", "Fa", "Po", "noFireplace"),
GarageFinish = c("Fin", "RFn", "Unf", "noGarage"),
GarageQual = c("Ex", "Gd", "TA", "Fa", "Po", "noGarage"),
GarageCond = c("Ex", "Gd", "TA", "Fa", "Po", "noGarage"),
PoolQC = c("Ex", "Gd", "TA", "Fa", "noPool"),
Fence = c("GdPrv", "MnPrv", "GdWo", "MnWw", "noFence"),
MoSold = 1:12
)
for (j in names(lvlOrd)) {
full[[j]] <- factor(full[[j]], levels = lvlOrd[[j]])
}
Some features’ values heavily depend on the value of another feature. For example the values of \(BsmtUnfSF\), \(TotalBsmtSF\) only make sense if features \(BsmtCond\), \(BsmtExposure\), \(BsmtQual\) are not \(noBasement\). This information is given in data_description.
\[ \begin{aligned} BsmtUnfSF, TotalBsmtSF : BsmtCond, BsmtExposure, BsmtQual \ne noBasement\\ BsmtFinSF1 : BsmtFinType1 \ne noBasement\\ BsmtFinSF2 : BsmtFinType2 \ne noBasement \\ GarageYrBlt, GarageCars, GarageArea: GarageType, GarageFinish, GarageQual, GarageCond \ne noGarage \\ PoolArea : PoolQC \ne noPool \\ MiscVal : MiscFeature \ne none \\ MasVnrArea : MasVnrType \ne none \end{aligned} \] For the garage and basement features there are several features which indicate the absence of that feature. If one of them indicates the absence of a feature, the others should too. This can be tested with the difference between two sets.
outersect <- function(x, y) sort(c(setdiff(x, y), setdiff(y, x)))
The differences between Ids of observations with noBasement are computed.
outersect(which(full$BsmtCond == "noBasement"),
which(full$BsmtExposure == "noBasement"))
## [1] 949 1488 2041 2186 2349 2525
outersect(which(full$BsmtCond == "noBasement"),
which(full$BsmtQual == "noBasement"))
## [1] 2041 2186 2218 2219 2525
full[c(949, 1488, 2041, 2186, 2218, 2219, 2349, 2525),
.(Id, BsmtCond, BsmtExposure, BsmtQual, BsmtUnfSF, TotalBsmtSF)]
## Id BsmtCond BsmtExposure BsmtQual BsmtUnfSF TotalBsmtSF
## 1: 949 TA noBasement Gd 936 936
## 2: 1488 TA noBasement Gd 1595 1595
## 3: 2041 noBasement Mn Gd 0 1426
## 4: 2186 noBasement No TA 94 1127
## 5: 2218 Fa No noBasement 173 173
## 6: 2219 TA No noBasement 356 356
## 7: 2349 TA noBasement Gd 725 725
## 8: 2525 noBasement Av TA 240 995
So here the data_description is not consistent. The general condition is set to noBasement but there are values for basement exposure, quality, and so on. noBasement was translated from NAs. Since for all the other basement related variables there are reasonable values it might be that in these cases NA did not mean noBasement.
The same can be done for the garage related variables.
outersect(which(full$GarageType == "noGarage"),
which(full$GarageFinish == "noGarage"))
## [1] 2127 2577
outersect(which(full$GarageType == "noGarage"),
which(full$GarageQual == "noGarage"))
## [1] 2127 2577
outersect(which(full$GarageType == "noGarage"),
which(full$GarageCond == "noGarage"))
## [1] 2127 2577
full[c(2127, 2577),
.(GarageYrBlt, GarageCars, GarageArea, GarageType,
GarageFinish, GarageQual, GarageCond)]
## GarageYrBlt GarageCars GarageArea GarageType GarageFinish GarageQual
## 1: NA 1 360 Detchd noGarage noGarage
## 2: NA NA NA Detchd noGarage noGarage
## GarageCond
## 1: noGarage
## 2: noGarage
Here, most garage related features were missing.
Mark Undefined Variables
If e.g. PoolQC is noPool, then there should be a missing vlaue or a zero for PoolArea. If there actually is a value, this could either mean it is a mistake or it means something special that is not clearly described. To at least be consistent, all NAs will be set to 0.
full[BsmtCond == "noBasement" & is.na(BsmtUnfSF), BsmtUnfSF := 0]
full[BsmtCond == "noBasement" & is.na(TotalBsmtSF), TotalBsmtSF := 0]
full[BsmtFinType1 == "noBasement" & is.na(BsmtFinSF1), BsmtFinSF1 := 0]
full[BsmtFinType2 == "noBasement" & is.na(BsmtFinSF2), BsmtFinSF2 := 0]
full[GarageType == "noGarage" & is.na(GarageYrBlt), GarageYrBlt := 0]
full[GarageType == "noGarage" & is.na(GarageCars), GarageCars := 0]
full[GarageType == "noGarage" & is.na(GarageArea), GarageArea := 0]
full[PoolQC == "noPool" & is.na(PoolArea), PoolArea := 0]
full[MiscFeature == "noFeature" & is.na(MiscVal), MiscVal := 0]
\(NA\)s which are left must be true missing data.
naFeats <- apply(full[, !"SalePrice"], 2, function(x) any(is.na(x)))
naObs <- apply(full[, !"SalePrice"], 1, function(x) any(is.na(x)))
Features
The total number of missing values per feature are shown below.
total <- apply(full[, naFeats, with = FALSE], 2, function(j) sum(is.na(j)))
df <- data.frame(
missing = total / nrow(full),
name = factor(names(total), levels = names(total)[order(total)]),
type = featAttr[match(names(total), featAttr$name), type]
)
ggplot(df, aes(x = name, y = missing, fill = type)) +
geom_bar(stat = "identity") +
scale_y_continuous("proportion missing") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle("Features with Missing Values")
Observations
Below the number of missing values for observations that have missing values is shown. Many observations have 1 or 2, some have 3 or 4 missing values.
total <- apply(full[naObs, ], 1, function(i) sum(is.na(i)))
df <- data.frame(
y = total,
name = full[naObs, Id]
)
highchart() %>%
hc_chart(type = "column") %>%
hc_title(text = "Observations with Missing Values") %>%
hc_xAxis(title = list(text = "observations"), categories = df$name,
labels = list(rotation = "-45")) %>%
hc_yAxis(title = list(text = "missing"), align = "left") %>%
hc_add_series(showInLegend = FALSE, data = list_parse(df[order(df$y), ]))
Feature distributions are described for each type of variable separately.
Continuous variables are very much skewed towards zero. Partly because there are many zeros, partly because they are log-normal distributed. Here, they are log10 transformed to better capture the spread. (log10 because it’s nicer to interpret than log) A new table full2 will hold the transformed values.
full2 <- full
for (j in contVars) {
full2[[j]] <- log10(full[[j]] + 1)
}
Their distributions are shown below.
dt <- melt(full2, "Id", contVars)
ggplot(dt, aes(x = value)) +
geom_histogram(bins = 30) +
facet_wrap(~ variable, scale = "free") +
theme_bw()
Some distributions are not visible because of all the zeros. The next plot shows the same, excluding zeros.
dt <- melt(full, "Id", contVars)
ggplot(dt, aes(x = value)) +
geom_histogram(bins = 30) +
scale_x_continuous(trans = "log10") +
facet_wrap(~ variable, scale = "free") +
theme_bw()
Summaries for each distribution are computed. There are many undefined variables which are set to zero. This influences the description a lot and is not of particular interest. Therefore, they are excluded.
dt <- full2[, contVars, with = FALSE]
df <- data.frame(
Min = apply(dt, 2, function(x) min(x[x > 0], na.rm = TRUE)),
Mean = apply(dt, 2, function(x) mean(x[x > 0], na.rm = TRUE)),
Median = apply(dt, 2, function(x) median(x[x > 0], na.rm = TRUE)),
Max = apply(dt, 2, function(x) max(x[x > 0], na.rm = TRUE))
)
Relative difference between mean and median (relDelta) are computed as an indicator for skewness. Also, relative cardinality and relative number of outliers are computed. Outliers are defined as being differing at least \(+/- 1.5 MAD\) from the median.
df$relDelta <- abs(df$Mean - df$Median) / df$Mean
df$relCard <- apply(dt, 2, function(x) {
length(unique(x[x > 0])) / length(x[x > 0])
})
df$relOut <- apply(dt, 2, function(x) {
length(boxplot.stats(x[x > 0])$out) / length(x[x > 0])
})
Below is a summary table. There are some outliers for PoolArea but this is because most values in this variable are zero.
DT::datatable(df, options = list(dom = "ftp", pageLength = 20)) %>%
formatStyle('relDelta',
color = styleInterval(c(0.2, 0.8), c('green', 'orange', 'red'))) %>%
formatStyle('relCard',
color = styleInterval(c(0.2, 0.6), c('red', 'orange', 'green'))) %>%
formatStyle('relOut',
color = styleInterval(c(0.05, 0.1), c('green', 'orange', 'red'))) %>%
formatRound(1:7, 3)
Discrete variables contain years and numerical measures. They are treated as continuous here.
dt <- melt(full, "Id", discVars)
ggplot(dt, aes(x = value)) +
geom_histogram(bins = 30) +
facet_wrap(~ variable, scale = "free") +
theme_bw()
A summary, as above is computed.
dt <- full[, discVars, with = FALSE]
df <- data.frame(
Min = apply(dt, 2, function(x) min(x[x > 0], na.rm = TRUE)),
Mean = apply(dt, 2, function(x) mean(x[x > 0], na.rm = TRUE)),
Median = apply(dt, 2, function(x) median(x[x > 0], na.rm = TRUE)),
Max = apply(dt, 2, function(x) max(x[x > 0], na.rm = TRUE))
)
Skewness and cardinality would be less interesting here. Instead the modes will be extracted.
Mode <- function(x, fst = TRUE) {
ux <- unique(x)
tab <- tabulate(match(x, ux))
modeIdx <- which.max(tab)
if (fst) return(list(value = ux[modeIdx], freq = tab[modeIdx]))
tab[modeIdx] <- 0
modeIdx <- which.max(tab)
return(list(value = ux[modeIdx], freq = tab[modeIdx]))
}
First and second modes are computed and the table is shown. Modes that make up more than 50% of observations will be highlighted.
df$Mode <- apply(dt, 2, function(x) Mode(x)$value)
df$ModeCount <- apply(dt, 2, function(x) Mode(x)$freq)
df$Mode2nd <- apply(dt, 2, function(x) Mode(x, fst = FALSE)$value)
df$ModeCount2nd <- apply(dt, 2, function(x) Mode(x, fst = FALSE)$freq)
DT::datatable(df, options = list(dom = "ftp", pageLength = 20)) %>%
formatStyle('ModeCount',
color = styleInterval(nrow(dt) / 2, c("black", 'orange'))) %>%
formatRound(2, 0)
In GarageYrBlt the maximum value is 2207, which does not make sense. It seems that this is a typo and the real value is 2007. After checking that there are not more values above 2010 2207 is changed to 2007.
full[GarageYrBlt == 2207, GarageYrBlt := 2007]
Ordinal Variables
dt <- melt(full, "Id", ordVars)
ggplot(dt, aes(x = value)) +
geom_bar() +
facet_wrap(~ variable, scales = "free") +
theme_bw()
Nominal Variables
dt <- melt(full, "Id", nomVars)
ggplot(dt, aes(x = value)) +
geom_bar() +
facet_wrap(~ variable, scales = "free") +
theme_bw()
Summary
Cardinality, 1st and 2nd modes are given as summary. Modes that make up more than 50% of observations will be highlighted.
dt <- full[, c(ordVars, nomVars), with = FALSE]
df <- data.frame(
Card = apply(dt, 2, function(x) length(unique(x[!is.na(x)]))),
Mode = apply(dt, 2, function(x) Mode(x)$value),
ModeCount = apply(dt, 2, function(x) Mode(x)$freq),
Mode2nd = apply(dt, 2, function(x) Mode(x, fst = FALSE)$value),
ModeCount2nd = apply(dt, 2, function(x) Mode(x, fst = FALSE)$freq),
Type = rep(c("ord", "nom"), c(length(ordVars), length(nomVars)))
)
DT::datatable(df, options = list(dom = "ftp", pageLength = 20)) %>%
formatStyle('Type',
color = styleEqual(unique(df$Type), c('orange', 'blue'))) %>%
formatStyle('ModeCount',
color = styleInterval(nrow(dt) / 2, c("black", 'orange')))
vars <- c(contVars, discVars)
The Pearson Correlation Coefficient is calculated for all feature pairs where both features are continuous. The results are shown as heatmap below. Using euclidean distance of the correlation matrix as a metric rows and columns were hierarchically ordered (complete link). Here, the log10 transformed values are used.
m <- cor(
full2[, vars, with = FALSE],
use = "complete.obs"
)
corrplot(m, method = "color", order = "hclust")
Some clusters are visible. The same hierarchical clustering is shown as tree below.
corrClust <- hclust(dist(m))
plot(corrClust)
vars <- c(ordVars, nomVars)
For categorical features Cramer’s V is calculated for all feature pairs to indicate correlation. The results are shown as heatmap below.
m <- as.matrix(full[, vars, with = FALSE])
m <- m[complete.cases(m), ]
mCramer <- matrix(NA, ncol(m), ncol(m))
for (i in seq_len(ncol(m))) {
mCramer[i, ] <- vapply(
seq_len(ncol(m)),
function(j) assocstats(table(m[, i], m[, j]))$cramer,
numeric(1)
)
}
Using euclidean distance of the correlation matrix as a metric rows and columns were hierarchically ordered (complete link).
colnames(mCramer) <- colnames(m)
rownames(mCramer) <- colnames(m)
corrplot(mCramer, method = "color", order = "hclust")
The tree for the clustering:
corrClust <- hclust(dist(mCramer))
plot(corrClust)
Ordinal variable levels were ordered before. Here, they are converted to integer and then treated as continuous.
for (j in ordVars) {
full2[[j]] <- as.integer(full2[[j]])
}
Except for 1 non-missing observation Utilities consists of only one value with NoSeWa. The rest is AllPub. Correlation cannot be calculated with this. It will be therefore be excldued here.
vars <- c(contVars, discVars, ordVars)
vars <- vars[!vars == "Utilities"]
Now they are compared to continuous and discrete variables.
m <- cor(
full2[, vars, with = FALSE],
use = "complete.obs"
)
corrplot(m, method = "color", order = "hclust")
corrClust <- hclust(dist(m))
plot(corrClust)
Single missing values in basement related features, where all other basement related features did have a value, are replaced by their mode.
full[c(949, 1488, 2349), BsmtExposure := "No"]
full[c(2041, 2186, 2525), BsmtCond := "TA"]
full[c(2218, 2219), BsmtQual := "TA"]
For garage related features there are two cases with almost no non-missing value for garage. For these two cases the garage features are set to noGarage.
full[c(2127, 2577), GarageYrBlt := 0]
full[c(2127, 2577), GarageCars := 0]
full[c(2127, 2577), GarageArea := 0]
full[c(2127, 2577), GarageType := "noGarage"]
Features with only very few missing values will be imputed using the mode for categories and the median for continuous variables.
full[is.na(Exterior1st), Exterior1st := "VinylSd"]
full[is.na(Exterior2nd), Exterior2nd := "VinylSd"]
full[is.na(Electrical), Electrical := "SBrkr"]
full[is.na(KitchenQual), KitchenQual := "TA"]
full[is.na(SaleType), SaleType := "WD"]
full[is.na(Utilities), Utilities := "AllPub"]
full[is.na(Functional), Functional := "Typ"]
full[is.na(MSZoning), MSZoning := "RL"]
full[is.na(MasVnrArea), MasVnrArea := 202]
full[is.na(MasVnrType), MasVnrType := "none"]
There were two observations with NA values for basement bath related variables in which all other basement related variables were set to noBasement. These will be set to zreo.
full[is.na(BsmtFullBath), BsmtFullBath := 0]
full[is.na(BsmtHalfBath), BsmtHalfBath := 0]
BldgType and LotFrontage have around 10-15% missing values. These will be imputed by Gibbs sampling of the posterior distribution. Variables with missing values are repeatedly predicted given a set of predictor variables. For LotFrontage predictive mean matching, and for BldgType polytomous regression will be used. As predictors all continuous and discrete variables, and the the 2 categorical variables which are most correlated to BldgType are included. Several datasets are created by this method but only one will be used.
vars <- c(contVars, discVars, "BldgType", "MSSubClass", "HouseStyle")
The transformed values will be used.
full2 <- full
for (j in contVars) {
full2[[j]] <- log10(full2[[j]] + 1)
}
dt <- full2[, vars, with = FALSE]
imp <- mice(dt, seed = 42)
LotFrontage
The distribution of LotFrontage after imputation is compared to the original distribution.
df <- data.frame(
LotFrontage = c(dt$LotFrontage, complete(imp)$LotFrontage),
Distro = rep(c("Original", "Imputed"), each = nrow(dt))
)
ggplot(df, aes(x = LotFrontage, color = Distro)) +
geom_density() +
theme_bw()
Both distributions look very similar. Furthermore, its correlation with the 4 most correlated continuous variables is compared.
df <- data.frame(
Value = c(dt$LotArea, dt$X1stFlrSF, dt$GrLivArea, dt$MasVnrArea),
Variable = rep(c("LotArea", "X1stFlrSF", "GrLivArea", "MasVnrArea"),
each = nrow(dt)),
LotFrontage = rep(complete(imp)$LotFrontage, each = 4),
isImputed = factor(is.na(dt$LotFrontage), levels = c(TRUE, FALSE))
)
ggplot(df, aes(x = LotFrontage, y = Value, color = isImputed)) +
geom_point(size = .7) +
facet_wrap( ~ Variable, scales = "free") +
theme_bw()
Imputed values seem to resemble the shape of the original data.
BldgType
Imputed and original distributions are compared.
df <- data.frame(
BldgType = c(as.character(dt$BldgType), as.character(complete(imp)$BldgType)),
Distro = rep(c("Original", "Imputed"), each = nrow(dt))
)
ggplot(df, aes(x = BldgType, fill = Distro)) +
geom_bar(position = "dodge") +
scale_x_discrete() +
theme_bw()
Next correlation to the closest correlated categorical feature MSSubClass before and after imputation is checked.
df <- data.frame(
BldgType = c(as.character(dt$BldgType), as.character(complete(imp)$BldgType)),
MSSubClass = rep(dt$MSSubClass, 2),
Distro = rep(c("Original", "Imputed"), each = nrow(dt))
)
ggplot(df, aes(x = MSSubClass, fill = BldgType)) +
geom_bar(position = "dodge") +
facet_grid(Distro ~ .) +
theme_bw()
Both
Finally, the correlation of both imputed variables is checked.
df <- data.frame(
LotFrontage = rep(c(dt$LotFrontage, complete(imp)$LotFrontage), each = 2),
BldgType = rep(c(as.character(dt$BldgType),
as.character(complete(imp)$BldgType)), each = 2),
Distro = rep(c("Original", "Imputed"), each = 2*nrow(dt))
)
ggplot(df, aes(x = LotFrontage, color = BldgType)) +
geom_density() +
facet_grid(Distro ~ .) +
theme_bw()
## Warning: Removed 972 rows containing non-finite values (stat_density).
The distribution of \(BldgType = TwnhsE\) changed visibly. However, with less than 10% this value is quite underrepresented so its distribution estimate is less stable. All in all the imputed values seem to be useful
full2$LotFrontage <- complete(imp)$LotFrontage
full2$BldgType <- complete(imp)$BldgType
With many features is might be useful to have an idea about the importance of each feature in order to perform a feature selection.
Here, the correlation of each feature to the target variable is visualized.
train <- full2[label == "train", !"label"]
As suggested by the dataset description, the target variable is log-normal distributed.
summary(train$SalePrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 130000 163000 180900 214000 755000
ggplot(train, aes(x = SalePrice)) +
geom_histogram(bins = 30) +
scale_x_continuous(trans = "log10") +
theme_bw()
Continuous Variables
The plots below visualize the correlatio between continuous variables and the target. The y axis is fixed to enable comparison between the variables. Visually X1stFlrSF and GrLivAreaseem to be important.
vars <- contVars
dt <- melt(train, "Id", vars)
dt$SalePrice <- rep(train$SalePrice, length(vars))
ggplot(dt, aes(y = SalePrice, x = value)) +
geom_point(size = .5) +
scale_y_continuous(trans = "log10") +
facet_wrap(~ variable, scales = "free_x") +
theme_bw()
Discrete Variables
Here YearBuilt, YearRemodAdd, GarageYrBlt, TotRmsAbvGrd, and FullBath seem important.
vars <- discVars
dt <- melt(train, "Id", vars)
dt$SalePrice <- rep(train$SalePrice, length(vars))
ggplot(dt, aes(y = SalePrice, x = as.factor(value))) +
geom_boxplot() +
scale_y_continuous(trans = "log10") +
facet_wrap(~ variable, scales = "free_x") +
theme_bw()
Ordinal Variables
Important features here are OverallQualt, ExterQual, BsmtQual, KitchenQual, basically the quality features. From these plots one could conclude that it does make sense to have these features numeric and not as a factor.
vars <- ordVars
dt <- melt(train, "Id", vars)
dt$SalePrice <- rep(train$SalePrice, length(vars))
ggplot(dt, aes(y = SalePrice, x = as.factor(value))) +
geom_boxplot() +
scale_y_continuous(trans = "log10") +
facet_wrap(~ variable, scales = "free_x") +
theme_bw()
Nominal Variables
There are not many particularly intersting features here. It seems \(MSZoning = C\) and \(CentralAir = Y\) is interesting though.
vars <- nomVars
dt <- melt(train, "Id", vars)
dt$SalePrice <- rep(train$SalePrice, length(vars))
ggplot(dt, aes(y = SalePrice, x = value)) +
geom_boxplot() +
scale_y_continuous(trans = "log10") +
facet_wrap(~ variable, scales = "free_x") +
theme_bw()
This is a method which repeatedly fits random forest to the training set in order to estimate variable importance. Importance here are Z-scores, their significance is estimated by random permutations. Here, the package boruta is used according to Kursa and Rudnicki. The plot below shows confirmed, tentative and rejected variables in green, yellow, and red respectively.
boruta <- Boruta(train[, -"SalePrice"], train$SalePrice, ntree = 500)
plot(boruta)
boruta
## Boruta performed 99 iterations in 3.187884 mins.
## 48 attributes confirmed important: BedroomAbvGr, BldgType,
## BsmtCond, BsmtExposure, BsmtFinSF1 and 43 more;
## 22 attributes confirmed unimportant: BsmtFinSF2, BsmtHalfBath,
## Condition1, Condition2, Electrical and 17 more;
## 10 tentative attributes left: Alley, BsmtFinType2, EnclosedPorch,
## Fence, Functional and 5 more;
The table below summarizes the procedure. The variable with the highest median Z-score by far is GrLivArea, followed by OverallQual.
df <- attStats(boruta)
DT::datatable(df, options = list(dom = "ftp", pageLength = 20)) %>%
formatStyle('decision',
color = styleEqual(unique(df$decision), c('red', 'green', 'orange'))) %>%
formatRound(1:5, 2)
That’s it! The modeling will be done in another kernel. Thanks for reading through this whole thing.